#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2021/8/8 0008 0:10 
# @Author : zjl
# @File : handle_ryf_article_url.py

import requests
from lxml import etree

# Work queue of pages still to be fetched, seeded with the newest known article.
articles = ['https://www.ruanyifeng.com/blog/2021/08/weekly-issue-170.html']
# Every article URL discovered so far; starts as an independent copy of the
# seed queue so that mutating one list never affects the other.
article_list = list(articles)


def handle_article_index():
    """
    Follow the link in the first "entry-location" list item from page to page.

    Starting from the URLs queued in the module-level ``articles`` list, each
    page is downloaded and the href of the first ``entry-location`` list item
    is extracted. A newly seen URL is appended to both ``articles`` (so it is
    fetched next) and ``article_list`` (the running record), and the full
    record is rewritten to ``./article_urls`` so progress survives an
    interruption.

    The crawl stops when the queue is empty, when the extracted URL is already
    in ``article_list``, or when the first list item's text no longer contains
    the marker ``上一篇``.

    :return: None. Results are left in ``article_list`` and ``./article_urls``.
    :raises requests.RequestException: if a page cannot be fetched, including
        when the request times out.
    """
    # First <li> of the navigation box; both the link and its label live here.
    location_item = '//div[@class="entry-location"]/ul/li[1]'

    # Iterate only while work is queued: nothing else can refill an empty
    # queue, so looping on it would spin forever.
    while articles:
        article_url = articles.pop(0)
        # A timeout keeps a stalled connection from hanging the whole crawl.
        resp = requests.get(article_url, timeout=30)
        resp.encoding = 'utf-8'
        page = etree.HTML(resp.text)
        other_article_url = ''.join(page.xpath(location_item + '/a/@href'))

        if other_article_url:
            if other_article_url in article_list:
                print('in article_list')
                break

            article_list.append(other_article_url)
            articles.append(other_article_url)
            # Rewrite the whole record after every discovery; the context
            # manager closes the file, so no explicit close() is needed.
            with open('./article_urls', 'w', encoding='utf-8') as fw:
                fw.write('\n'.join(article_list))
        print(articles)

        # Without the marker text in the first list item, stop crawling.
        location_label = ''.join(page.xpath(location_item + '/text()'))
        if '上一篇' not in location_label:
            print('in 上一篇')
            print(location_label)
            break


# Start the crawl only when this file is executed as a script, so importing
# the module does not trigger network requests or file writes.
if __name__ == '__main__':
    handle_article_index()
